• DOMAIN: Healthcare
• CONTEXT: Medical research university X is undergoing deep research on patients with certain conditions. The university has an internal AI team. Due to confidentiality, the patients' details and their conditions are masked; the client provides separate datasets to the AI team for developing an AIML model which can predict the condition of a patient depending on the received test results.
• DATA DESCRIPTION: The data consists of biomechanics features of the patients according to their current conditions. Each patient is represented in the data set by six biomechanics attributes derived from the shape and orientation of the condition to their body part.
• PROJECT OBJECTIVE: Demonstrate the ability to fetch, process and leverage data to generate useful predictions by training Supervised Learning algorithms.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import train_test_split # Sklearn package's randomized data splitting function
from sklearn.neighbors import KNeighborsClassifier
from scipy.stats import zscore
from scipy import stats
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve, accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV, RepeatedStratifiedKFold
# Load the three condition-specific CSV extracts supplied by the client.
Normal_DF = pd.read_csv('Part1 - Normal.csv')
Type_H_DF = pd.read_csv('Part1 - Type_H.csv')
Type_S_DF = pd.read_csv('Part1 - Type_S.csv')
# Report row/column counts so the merged total can be sanity-checked later.
print('Normal_DF shape is : {}'.format(Normal_DF.shape))
print('Type_H_DF shape is : {}'.format(Type_H_DF.shape))
print('Type_S_DF shape is : {}'.format(Type_S_DF.shape))
From Shape it is clearly visible that all the CSV Files have same number of columns i.e. 7
No of rows in Part1 - Normal.csv is 100,
No of rows in Part1 - Type_H.csv is 60,
No of rows in Part1 - Type_S.csv is 150,
# Inspect column names, dtypes and non-null counts of each source frame.
Normal_DF.info()
Type_H_DF.info()
Type_S_DF.info()
From info command, it is clearly visible that all the files have same columns i.e.: P_incidence, P_tilt, L_angle, S_slope, P_radius, S_degree, Class,
All the columns are float except Class type which is object
Also we can see that all the columns are non-null, but we still need to check for garbage values to confirm that the data is ready for analysis.
Concatenate Function can be used to merge all the three Data frames i.e. Normal_DF, Type_H_DF and Type_S_DF
Total no of rows should be 100 + 60 +150 = 310
# Stack the three class-specific frames row-wise into one working dataset.
Merged_Health_DF = pd.concat([Normal_DF,Type_H_DF,Type_S_DF]) # All the three Dataframes are merged onto one another
print('Merged_Health_DF shape is : {}'.format(Merged_Health_DF.shape)) # Final Shape of the merged Data frame
Merged_Health_DF.info()
From Above, We can see that all the Data Frames are merged on top of one another
Total number of rows are 310
Total number of columns are 7
# Summary statistics for every column (include='all' covers the object Class column too).
Merged_Health_DF.describe(include='all')
Merged_Health_DF.head()
# Checking the unique values in the Class column
Merged_Health_DF.Class.unique()
Normal is also written as Nrmal
Type_H is also written as type_h
Type_S is also written as tp_s
# Normalise the inconsistent Class spellings to the three canonical labels.
_class_fixes = {'Nrmal': 'Normal', 'type_h': 'Type_H', 'tp_s': 'Type_S'}
Merged_Health_DF.Class = Merged_Health_DF.Class.replace(_class_fixes)
# Check for the unique values in Class column after replace action
Merged_Health_DF.Class.unique()
Merged_Health_DF.info()
Int64Index: 310 entries, 0 to 149
Merged_Health_DF.head() # Check the start of the index
Merged_Health_DF.tail() # Check the end of the index
# Concatenation kept each source frame's own index; rebuild a clean 0..309 RangeIndex.
Merged_Health_DF.reset_index(drop=True, inplace=True)
Merged_Health_DF.info()
RangeIndex: 310 entries, 0 to 309
# Encode the target: LabelEncoder assigns codes in sorted label order,
# so Normal -> 0, Type_H -> 1, Type_S -> 2.
label_Encoder = LabelEncoder()
Merged_Health_DF['Class'] = label_Encoder.fit_transform(Merged_Health_DF['Class'])
Merged_Health_DF['Class'].unique()
Merged_Health_DF.head()
### we can see that Class column is now converted into INT64 data type with the values as 0,1,2.
#Checking the presence of any null values (per-column counts)
Merged_Health_DF.isnull().sum()
# Different method to check the null values where the result data type is bool
Merged_Health_DF.isnull().any()
Merged_Health_DF.info()
Merged_Health_DF.describe(include='all')
# Check the skewness for Merged_Health_DF Dataframe
# P_incidence, L_angle, S_slope and S_Degree are right skewed. (Outliers present towards right) -
# same can be viewed in the Histograms below
Merged_Health_DF.hist(figsize=(15,30)); # as per the histograms we can see that P_incidence, L_angle, S_slope and S_Degree are right skewed
Merged_Health_DF.corr()
sns.heatmap(Merged_Health_DF.corr(),annot=True);
In above plot, yellow colour represents maximum correlation and blue colour represents minimum correlation.
# 2x2 panel for P_incidence: distribution, overall boxplot, per-class boxplot, per-class stripplot.
fig = plt.figure(figsize = (15,10))
plt.subplot(2, 2, 1)
sns.distplot(Merged_Health_DF.P_incidence); # Plotting the histogram for P_incidence
plt.title("Distribution of P_incidence")
plt.subplot(2, 2, 2)
sns.boxplot(Merged_Health_DF.P_incidence)
plt.title("BOXPLOT of P_incidence")
plt.subplot(2, 2, 3)
sns.boxplot(x=Merged_Health_DF.Class, y=Merged_Health_DF.P_incidence)
plt.subplot(2, 2, 4)
sns.stripplot(x=Merged_Health_DF.Class, y=Merged_Health_DF.P_incidence)
plt.show()
# We can see that there are outliers for P_incidence and we can remove them from the dataset
# Work on an explicit copy so the original merged frame is clearly left untouched.
Merged_Health_Out = Merged_Health_DF.copy()
# IQR fences for P_incidence: points outside Q1 - 1.5*IQR / Q3 + 1.5*IQR are outliers.
P_incidence_Q1 = Merged_Health_Out.P_incidence.quantile(0.25)
P_incidence_Q3 = Merged_Health_Out.P_incidence.quantile(0.75)
P_incidence_IQR = P_incidence_Q3-P_incidence_Q1
# Lower Fence and Upper fence
Upper_Fence1 = P_incidence_Q3 + (1.5 * P_incidence_IQR)
Lower_Fence1 = P_incidence_Q1 - (1.5 * P_incidence_IQR)
# BUG FIX: the original applied only the upper fence and left Lower_Fence1
# unused; both fences are applied here for consistency with the other
# attributes (for this right-skewed column the lower fence removes no rows).
Merged_Health_Out = Merged_Health_Out[Merged_Health_Out.P_incidence <= Upper_Fence1]
Merged_Health_Out = Merged_Health_Out[Merged_Health_Out.P_incidence >= Lower_Fence1]
Merged_Health_Out.Class.count()
# Earlier total number of rows were 310 and after removal of the 3 outliers we can see that dataframe is having 307 rows
# 2x2 panel for S_slope: distribution, overall boxplot, per-class boxplot, per-class stripplot.
fig = plt.figure(figsize = (15,10))
plt.subplot(2, 2, 1)
sns.distplot(Merged_Health_DF.S_slope); # Plotting the histogram for S_slope
plt.title("Distribution of S_slope")
plt.subplot(2, 2, 2)
sns.boxplot(Merged_Health_DF.S_slope)
plt.title("BOXPLOT of S_slope")
plt.subplot(2, 2, 3)
sns.boxplot(x=Merged_Health_DF.Class, y=Merged_Health_DF.S_slope)
plt.subplot(2, 2, 4)
sns.stripplot(x=Merged_Health_DF.Class, y=Merged_Health_DF.S_slope)
plt.show()
# Remove S_slope outliers with the 1.5*IQR rule.
q1_slope = Merged_Health_Out.S_slope.quantile(0.25)
q3_slope = Merged_Health_Out.S_slope.quantile(0.75)
iqr_slope = q3_slope - q1_slope
# Lower Fence and Upper fence
Upper_Fence2 = q3_slope + 1.5 * iqr_slope
Lower_Fence2 = q1_slope - 1.5 * iqr_slope
# Keep only the rows lying inside both fences (single combined mask).
inside_slope = (Merged_Health_Out.S_slope >= Lower_Fence2) & (Merged_Health_Out.S_slope <= Upper_Fence2)
Merged_Health_Out = Merged_Health_Out[inside_slope]
Merged_Health_Out.Class.count()
# Plotting the histogram for L_angle
# 2x2 panel for L_angle: distribution, overall boxplot, per-class boxplot, per-class stripplot.
fig = plt.figure(figsize = (15,10))
plt.subplot(2, 2, 1)
sns.distplot(Merged_Health_DF.L_angle);
plt.title("Distribution of L_angle")
plt.subplot(2, 2, 2)
sns.boxplot(Merged_Health_DF.L_angle)
plt.title("BOXPLOT of L_angle")
plt.subplot(2, 2, 3)
sns.boxplot(x=Merged_Health_DF.Class, y=Merged_Health_DF.L_angle)
plt.subplot(2, 2, 4)
sns.stripplot(x=Merged_Health_DF.Class, y=Merged_Health_DF.L_angle)
plt.show()
# One outlier is visible in L_angle; drop rows outside the 1.5*IQR fences.
q1_angle = Merged_Health_Out.L_angle.quantile(0.25)
q3_angle = Merged_Health_Out.L_angle.quantile(0.75)
iqr_angle = q3_angle - q1_angle
# Lower Fence and Upper fence
Upper_Fence3 = q3_angle + 1.5 * iqr_angle
Lower_Fence3 = q1_angle - 1.5 * iqr_angle
# Keep only the rows lying inside both fences (single combined mask).
inside_angle = (Merged_Health_Out.L_angle >= Lower_Fence3) & (Merged_Health_Out.L_angle <= Upper_Fence3)
Merged_Health_Out = Merged_Health_Out[inside_angle]
Merged_Health_Out.Class.count()
# total no of rows are 306 now
#### Univariate plots for S_Degree
# Plotting the histogram for S_Degree
fig = plt.figure(figsize = (15,10))
plt.subplot(2, 2, 1)
sns.distplot(Merged_Health_DF.S_Degree);
plt.title("Distribution of S_Degree")
plt.subplot(2, 2, 2)
sns.boxplot(Merged_Health_DF.S_Degree)
plt.title("BOXPLOT of S_Degree")
plt.subplot(2, 2, 3)
sns.boxplot(x=Merged_Health_DF.Class, y=Merged_Health_DF.S_Degree)
plt.subplot(2, 2, 4)
sns.stripplot(x=Merged_Health_DF.Class, y=Merged_Health_DF.S_Degree)
plt.show()
# Remove S_Degree outliers with the 1.5*IQR rule.
q1_degree = Merged_Health_Out.S_Degree.quantile(0.25)
q3_degree = Merged_Health_Out.S_Degree.quantile(0.75)
iqr_degree = q3_degree - q1_degree
# Lower Fence and Upper fence
Upper_Fence4 = q3_degree + 1.5 * iqr_degree
Lower_Fence4 = q1_degree - 1.5 * iqr_degree
# Keep only the rows lying inside both fences (single combined mask).
inside_degree = (Merged_Health_Out.S_Degree >= Lower_Fence4) & (Merged_Health_Out.S_Degree <= Upper_Fence4)
Merged_Health_Out = Merged_Health_Out[inside_degree]
Merged_Health_Out.Class.count()
# Plotting the histogram for P_radius
# 2x2 panel for P_radius: distribution, overall boxplot, per-class boxplot, per-class stripplot.
fig = plt.figure(figsize = (15,10))
plt.subplot(2, 2, 1)
sns.distplot(Merged_Health_DF.P_radius);
plt.title("Distribution of P_radius")
plt.subplot(2, 2, 2)
sns.boxplot(Merged_Health_DF.P_radius)
plt.title("BOXPLOT of P_radius")
plt.subplot(2, 2, 3)
sns.boxplot(x=Merged_Health_DF.Class, y=Merged_Health_DF.P_radius)
plt.subplot(2, 2, 4)
sns.stripplot(x=Merged_Health_DF.Class, y=Merged_Health_DF.P_radius)
plt.show()
# Remove P_radius outliers with the 1.5*IQR rule.
q1_radius = Merged_Health_Out.P_radius.quantile(0.25)
q3_radius = Merged_Health_Out.P_radius.quantile(0.75)
iqr_radius = q3_radius - q1_radius
# Lower Fence and Upper fence
Upper_Fence5 = q3_radius + 1.5 * iqr_radius
Lower_Fence5 = q1_radius - 1.5 * iqr_radius
# Keep only the rows lying inside both fences (single combined mask).
inside_radius = (Merged_Health_Out.P_radius >= Lower_Fence5) & (Merged_Health_Out.P_radius <= Upper_Fence5)
Merged_Health_Out = Merged_Health_Out[inside_radius]
Merged_Health_Out.Class.count()
# total no of rows are 286 now in Merged_Health_Out
# Re-plot histograms on the outlier-free frame for comparison with the originals.
Merged_Health_Out.hist(figsize=(15,15));
# Side-by-side correlation heatmaps: with vs without outliers.
fig = plt.figure(figsize = (15,5))
plt.subplot(1, 2, 1)
sns.heatmap(Merged_Health_DF.corr(),annot=True);
plt.title("heat map with outliers")
plt.subplot(1, 2, 2)
sns.heatmap(Merged_Health_Out.corr(),annot=True);
plt.title("heat map without outliers")
plt.show()
#distribution of class values and other attributes
sns.pairplot(Merged_Health_Out,diag_kind='kde');
# We know that Class 0 represents 'Normal', Class 1 represents 'Type_H' and Class 2 represents 'Type_S'
# Scatter plots for P_incidence vs S_slope, with/without outliers and with/without class hue.
fig = plt.figure(figsize = (15,10))
plt.subplot(2, 2, 1)
sns.scatterplot(Merged_Health_DF.P_incidence,Merged_Health_DF.S_slope)
plt.title("Scatter Plot of P_incidence and S_slope with outlliers")
plt.subplot(2, 2, 2)
sns.scatterplot(Merged_Health_DF.P_incidence,Merged_Health_DF.S_slope,hue=Merged_Health_DF.Class)
plt.subplot(2, 2, 3)
plt.title("Scatter Plot of P_incidence and S_slope without outlliers")
sns.scatterplot(Merged_Health_Out.P_incidence,Merged_Health_Out.S_slope)
plt.subplot(2, 2, 4)
sns.scatterplot(Merged_Health_Out.P_incidence,Merged_Health_Out.S_slope,hue=Merged_Health_Out.Class)
plt.show()
# Scatter plots for P_incidence vs L_angle, with/without outliers and with/without class hue.
fig = plt.figure(figsize = (15,10))
plt.subplot(2, 2, 1)
sns.scatterplot(Merged_Health_DF.P_incidence,Merged_Health_DF.L_angle)
plt.title("Scatter Plot of P_incidence and L_angle with outlliers")
plt.subplot(2, 2, 2)
sns.scatterplot(Merged_Health_DF.P_incidence,Merged_Health_DF.L_angle,hue=Merged_Health_DF.Class)
plt.subplot(2, 2, 3)
plt.title("Scatter Plot of P_incidence and L_angle without outlliers")
sns.scatterplot(Merged_Health_Out.P_incidence,Merged_Health_Out.L_angle)
plt.subplot(2, 2, 4)
sns.scatterplot(Merged_Health_Out.P_incidence,Merged_Health_Out.L_angle,hue=Merged_Health_Out.Class)
plt.show()
# Scatter plots for S_slope vs L_angle, with/without outliers and with/without class hue.
fig = plt.figure(figsize = (15,10))
plt.subplot(2, 2, 1)
sns.scatterplot(Merged_Health_DF.S_slope,Merged_Health_DF.L_angle)
plt.title("Scatter Plot of S_slope and L_angle with outlliers")
plt.subplot(2, 2, 2)
sns.scatterplot(Merged_Health_DF.S_slope,Merged_Health_DF.L_angle,hue=Merged_Health_DF.Class)
plt.subplot(2, 2, 3)
plt.title("Scatter Plot of S_slope and L_angle without outlliers")
sns.scatterplot(Merged_Health_Out.S_slope,Merged_Health_Out.L_angle)
plt.subplot(2, 2, 4)
sns.scatterplot(Merged_Health_Out.S_slope,Merged_Health_Out.L_angle,hue=Merged_Health_Out.Class)
plt.show()
# Scatter plots for S_slope vs P_radius, with/without outliers and with/without class hue.
fig = plt.figure(figsize = (15,10))
plt.subplot(2, 2, 1)
sns.scatterplot(Merged_Health_DF.S_slope,Merged_Health_DF.P_radius)
plt.title("Scatter Plot of S_slope and P_radius with outlliers")
plt.subplot(2, 2, 2)
sns.scatterplot(Merged_Health_DF.S_slope,Merged_Health_DF.P_radius,hue=Merged_Health_DF.Class)
plt.subplot(2, 2, 3)
plt.title("Scatter Plot of S_slope and P_radius without outlliers")
sns.scatterplot(Merged_Health_Out.S_slope,Merged_Health_Out.P_radius)
plt.subplot(2, 2, 4)
sns.scatterplot(Merged_Health_Out.S_slope,Merged_Health_Out.P_radius,hue=Merged_Health_Out.Class)
plt.show()
# Predictors: all six biomechanics attributes.
X = Merged_Health_DF.drop('Class',axis=1)
# Target
y = Merged_Health_DF.Class
X.info() # We can see that Class Variable is dropped from the predictor dataframe
# # convert the features into z scores as we do not know what units / scales were used and store them in new dataframe
# # It is always adviced to scale numeric attributes in models that calculate distances.
# XScaled = X.apply(zscore) # convert all attributes to Z scale
# XScaled.describe()
from sklearn.preprocessing import StandardScaler
sc=StandardScaler()
# Standardise features: essential for KNN since it is distance-based.
XScaled = sc.fit_transform(X)
# Tried with both Standard scaler and Z score, Results are same
y.count()
# Below is the classification of Class 0, 1 and 2 in the final data frame now
unique, counts = np.unique(y, return_counts=True)
print(np.asarray((unique, counts)).T)
# Split X and y into training and test set in 70:30 ratio
# Stratify is used to make sure that data is classified proportionally basis on the classes
X_train, X_test, y_train, y_test = train_test_split(XScaled, y, test_size=0.30,stratify=y)
# Verify the stratified class balance in train and test splits.
unique, counts = np.unique(y_train, return_counts=True)
print(np.asarray((unique, counts)).T)
y_train.count()
unique, counts = np.unique(y_test, return_counts=True)
print(np.asarray((unique, counts)).T)
y_test.count()
# planning to split the data basis on the 70:30 ratio where training is 70% and testing data is 30%
print('Class 0 training data is {}'.format(len(y_train[y_train == 0])/len(y_train)*100))
print('Class 1 training data is {}'.format(len(y_train[y_train == 1])/len(y_train)*100))
print('Class 2 training data is {}'.format(len(y_train[y_train == 2])/len(y_train)*100))
print('Class 0 testing data is {}'.format(len(y_test[y_test == 0])/len(y_test)*100))
print('Class 1 testing data is {}'.format(len(y_test[y_test == 1])/len(y_test)*100))
print('Class 2 testing data is {}'.format(len(y_test[y_test == 2])/len(y_test)*100))
# Lets choose the K basis on the simple formula i.e. Square root of n (n = number of data points in training data)
# Square root(217) and choose the odd number i.e. 15
KNN = KNeighborsClassifier(n_neighbors= 15,weights = 'uniform', metric = 'euclidean')
# Call Nearest Neighbour algorithm
KNN.fit(X_train, y_train)
# For every test data point, predict its label based on 15 nearest neighbours in this model. The majority class will
# be assigned to the test data point
predicted_labels = KNN.predict(X_test)
print('Testing Accuracy when K = 15 is {}'.format(KNN.score(X_test, y_test)))
print('Training Accuracy when K = 15 is {}'.format(KNN.score(X_train, y_train)))
# calculate accuracy measures and confusion matrix
from sklearn import metrics
print(metrics.confusion_matrix(y_test, predicted_labels))
print("Confusion Matrix")
cm=metrics.confusion_matrix(y_test, predicted_labels, labels=[0,1,2])
df_cm = pd.DataFrame(cm, index = [i for i in [0,1,2]],
columns = [i for i in ["Predict 0","Predict 1","Predict 2"]])
plt.figure(figsize = (5,5))
sns.heatmap(df_cm, annot=True)
#import classification_report
from sklearn.metrics import classification_report
print(classification_report(y_test,predicted_labels))
# Test-set accuracy of KNN for every K in 1..49 (elbow search).
# fit() returns the estimator, so the whole fit-and-score step chains cleanly.
scores = [
    KNeighborsClassifier(n_neighbors=k, weights='uniform', metric='euclidean')
    .fit(X_train, y_train)
    .score(X_test, y_test)
    for k in range(1, 50)
]
plt.plot(range(1,50),scores)
# Train vs test accuracy across K = 1..49 to visualise over/under-fitting.
K= np.arange(1, 50)
train_accuracy_plot = np.empty(len(K))
test_accuracy_plot = np.empty(len(K))
for i, k in enumerate(K):
    knn = KNeighborsClassifier(n_neighbors=k,weights = 'uniform', metric = 'euclidean')
    knn.fit(X_train,y_train)
    train_accuracy_plot[i] = knn.score(X_train,y_train)
    test_accuracy_plot[i] = knn.score(X_test,y_test)
# Generate plot
plt.title('k-NN: Varying Number of Neighbors')
plt.plot(K, test_accuracy_plot, label = 'Testing Accuracy')
plt.plot(K, train_accuracy_plot, label = 'Training Accuracy')
plt.legend()
plt.xlabel('Number of Neighbors')
plt.ylabel('Accuracy')
plt.show()
from sklearn.model_selection import GridSearchCV
#List Hyperparameters that we want to tune.
leaf_size = list(range(1,50))
n_neighbors = list(range(1,30))
p=[1,2]
#Convert to dictionary
hyperparameters = dict(leaf_size=leaf_size, n_neighbors=n_neighbors, p=p)
#Create new KNN object
knn_2 = KNeighborsClassifier()
#Use GridSearch with 10-fold cross validation over the parameter grid
clf = GridSearchCV(knn_2, hyperparameters, cv=10)
#Fit the model
best_model = clf.fit(X_train, y_train)
#Print The value of best Hyperparameters
print('Best leaf_size:', best_model.best_estimator_.get_params()['leaf_size'])
print('Best p:', best_model.best_estimator_.get_params()['p'])
print('Best n_neighbors:', best_model.best_estimator_.get_params()['n_neighbors'])
# Evaluate the grid-search winner on the held-out split.
predict = best_model.predict(X_test)
# FIX: report the K actually selected by the grid search instead of the
# hard-coded "K = 3" the original printed regardless of the search result.
best_k = best_model.best_estimator_.get_params()['n_neighbors']
print('Testing Accuracy when K = {} is {}'.format(best_k, best_model.score(X_test, y_test)))
print('Training Accuracy when K = {} is {}'.format(best_k, best_model.score(X_train, y_train)))
print(classification_report(y_test,predict))
print("Confusion Matrix")
cm=metrics.confusion_matrix(y_test, predict, labels=[0,1,2])
df_cm = pd.DataFrame(cm, index = [i for i in [0,1,2]],
columns = [i for i in ["Predict 0","Predict 1","Predict 2"]])
plt.figure(figsize = (7,5))
sns.heatmap(df_cm, annot=True)
# Predictors from the OUTLIER-FREE frame.
X_out = Merged_Health_Out.drop('Class',axis=1)
# Target
y_out = Merged_Health_Out.Class
X_out.info() # We can see that Class Variable is dropped from the predictor dataframe
# # convert the features into z scores as we do not know what units / scales were used and store them in new dataframe
# # It is always adviced to scale numeric attributes in models that calculate distances.
# XScaled = X.apply(zscore) # convert all attributes to Z scale
# XScaled.describe()
from sklearn.preprocessing import StandardScaler
sc=StandardScaler()
XScaled_out = sc.fit_transform(X_out)
# Tried with both Standard scaler and Z score, Results are same
# Below is the classification of Class 0, 1 and 2 in the data frame where the outliers are removed
unique, counts = np.unique(y_out, return_counts=True)
print(np.asarray((unique, counts)).T)
# Split X and y into training and test set in 70:30 ratio
# Stratify is used to make sure that data is classified proportionally basis on the classes
X_train_out, X_test_out, y_train_out, y_test_out = train_test_split(XScaled_out, y_out, test_size=0.30,stratify=y_out)
# Verify the stratified class balance in train and test splits.
unique, counts = np.unique(y_train_out, return_counts=True)
print(np.asarray((unique, counts)).T)
y_train_out.count()
unique, counts = np.unique(y_test_out, return_counts=True)
print(np.asarray((unique, counts)).T)
y_test_out.count()
# planning to split the data basis on the 70:30 ratio where training is 70% and testing data is 30%
print('Class 0 training data is {} when the outliers are removed'.format(len(y_train_out[y_train_out == 0])/len(y_train_out)*100))
print('Class 1 training data is {} when the outliers are removed'.format(len(y_train_out[y_train_out == 1])/len(y_train_out)*100))
print('Class 2 training data is {} when the outliers are removed'.format(len(y_train_out[y_train_out == 2])/len(y_train_out)*100))
print('Class 0 testing data is {} when the outliers are removed'.format(len(y_test_out[y_test_out == 0])/len(y_test_out)*100))
print('Class 1 testing data is {} when the outliers are removed'.format(len(y_test_out[y_test_out == 1])/len(y_test_out)*100))
print('Class 2 testing data is {} when the outliers are removed'.format(len(y_test_out[y_test_out == 2])/len(y_test_out)*100))
# KNN on the outlier-free split, same K = 15 as the full-data model.
KNN_out = KNeighborsClassifier(n_neighbors= 15,weights = 'uniform', metric = 'euclidean')
# Call Nearest Neighbour algorithm
KNN_out.fit(X_train_out, y_train_out)
# For every test data point, predict its label based on 15 nearest neighbours in this model. The majority class will
# be assigned to the test data point
predicted_labels_out = KNN_out.predict(X_test_out)
# BUG FIX: the original scored the outlier-free split with KNN (the model
# trained on the FULL data); score the model actually fitted here (KNN_out).
print('Testing Accuracy when K = 15 is {}'.format(KNN_out.score(X_test_out, y_test_out)))
print('Training Accuracy when K = 15 is {}'.format(KNN_out.score(X_train_out, y_train_out)))
# calculate accuracy measures and confusion matrix
from sklearn import metrics
print(metrics.confusion_matrix(y_test_out, predicted_labels_out))
print("Confusion Matrix")
cm=metrics.confusion_matrix(y_test_out, predicted_labels_out, labels=[0,1,2])
df_cm = pd.DataFrame(cm, index = [i for i in [0,1,2]],
columns = [i for i in ["Predict 0","Predict 1","Predict 2"]])
plt.figure(figsize = (5,5))
sns.heatmap(df_cm, annot=True)
#import classification_report
from sklearn.metrics import classification_report
print(classification_report(y_test_out,predicted_labels_out))
# Test-set accuracy of KNN for every K in 1..49 on the outlier-free split.
scores =[]
for k in range(1,50):
    NNH = KNeighborsClassifier(n_neighbors = k,weights = 'uniform', metric = 'euclidean')
    NNH.fit(X_train_out, y_train_out)
    scores.append(NNH.score(X_test_out, y_test_out))
plt.plot(range(1,50),scores)
# Train vs test accuracy across K = 1..49 on the outlier-free split.
K= np.arange(1, 50)
train_accuracy_plot = np.empty(len(K))
test_accuracy_plot = np.empty(len(K))
for i, k in enumerate(K):
    knn = KNeighborsClassifier(n_neighbors=k,weights = 'uniform', metric = 'euclidean')
    knn.fit(X_train_out,y_train_out)
    train_accuracy_plot[i] = knn.score(X_train_out,y_train_out)
    test_accuracy_plot[i] = knn.score(X_test_out,y_test_out)
# Generate plot
plt.title('k-NN: Varying Number of Neighbors')
plt.plot(K, test_accuracy_plot, label = 'Testing Accuracy')
plt.plot(K, train_accuracy_plot, label = 'Training Accuracy')
plt.legend()
plt.xlabel('Number of Neighbors')
plt.ylabel('Accuracy')
plt.show()
from sklearn.model_selection import GridSearchCV
#List Hyperparameters that we want to tune.
leaf_size = list(range(1,50))
n_neighbors = list(range(1,30))
p=[1,2]
#Convert to dictionary
hyperparameters = dict(leaf_size=leaf_size, n_neighbors=n_neighbors, p=p)
#Create new KNN object
knn_3 = KNeighborsClassifier()
#Use GridSearch with 3-fold cross validation on the (smaller) outlier-free split
clf = GridSearchCV(knn_3, hyperparameters, cv=3)
#Fit the model
best_model_out = clf.fit(X_train_out, y_train_out)
#Print The value of best Hyperparameters
print('Best leaf_size:', best_model_out.best_estimator_.get_params()['leaf_size'])
print('Best p:', best_model_out.best_estimator_.get_params()['p'])
print('Best n_neighbors:', best_model_out.best_estimator_.get_params()['n_neighbors'])
# Evaluate the grid-search winner on the outlier-free held-out split.
predict_out = best_model_out.predict(X_test_out)
# FIX: report the K actually selected by the grid search instead of the
# hard-coded "K = 9" the original printed regardless of the search result.
best_k_out = best_model_out.best_estimator_.get_params()['n_neighbors']
print('Testing Accuracy when K = {} is {}'.format(best_k_out, best_model_out.score(X_test_out, y_test_out)))
print('Training Accuracy when K = {} is {}'.format(best_k_out, best_model_out.score(X_train_out, y_train_out)))
print(classification_report(y_test_out,predict_out))
print("Confusion Matrix")
cm=metrics.confusion_matrix(y_test_out, predict_out, labels=[0,1,2])
df_cm = pd.DataFrame(cm, index = [i for i in [0,1,2]],
columns = [i for i in ["Predict 0","Predict 1","Predict 2"]])
plt.figure(figsize = (7,5))
sns.heatmap(df_cm, annot=True)
=================================================================================================================
• DOMAIN: Banking and finance
• CONTEXT: A bank X is on a massive digital transformation for all its departments. Bank has a growing customer base where the majority are liability customers (depositors) vs borrowers (asset customers). The bank is interested in expanding the borrowers base rapidly to bring in more business via loan interests. A campaign that the bank ran in the last quarter showed an average single digit conversion rate. Digital transformation being the core strength of the business strategy, the marketing department wants to devise effective campaigns with better target marketing to increase the conversion ratio to double digits with the same budget as per the last campaign.
• DATA DESCRIPTION: The data consists of the following attributes:
Build an AIML model to perform focused marketing by predicting the potential customers who will convert using the historical dataset.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from scipy.stats import zscore
from scipy import stats
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve, accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV, RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
# Load the two bank extracts; they share the customer ID column and are merged below.
Bank_Part1 = pd.read_csv('Part2 - Data1.csv')
Bank_Part2 = pd.read_csv('Part2 -Data2.csv')
print('Bank_Part1 shape is : {}'.format(Bank_Part1.shape))
print('Bank_Part2 shape is : {}'.format(Bank_Part2.shape))
Bank_Part1.info()
Bank_Part2.info()
Also, we can see that the LoanOnCard column has 4980 non-null values out of 5000, which means there could be 20 NULLs.
# Outer join on ID keeps every customer even if present in only one extract.
Bank_Merge = pd.merge(Bank_Part1,Bank_Part2,how='outer',on='ID') # Both the Data frames are merged basis on the ID
print('Bank_Merge shape is : {}'.format(Bank_Merge.shape)) # Final Shape of the merged Data frame
Bank_Merge.info()
Bank_Merge.head()
Bank_Merge.info()
Bank_Merge.describe(include='all')
# Different method to check the NULL values where the result data type is bool
Bank_Merge.isnull().any()
#Checking the presence of how many null values are present
Bank_Merge.isnull().sum()
# Total of 20 NULLs are present in LoanOnCard
# Treat a missing LoanOnCard as "no loan on card".
# BUG FIX: fill with numeric 0 rather than the string '0' — the original
# temporarily turned the column into a mixed object column before the int
# cast (final values are identical; the intermediate dtype was wrong).
Bank_Merge.LoanOnCard = Bank_Merge.LoanOnCard.fillna(0)
#Checking the presence of how many null values are present
Bank_Merge.isnull().sum()
# now zero NULLs are present in the dataset and NULLs are replaced with zeroes
# Different method to check the NULL values where the result data type is bool
Bank_Merge.isnull().any()
Bank_Merge.info()
# Cast to int64 now that the column holds only 0 and 1.
Bank_Merge['LoanOnCard'] = Bank_Merge['LoanOnCard'].astype('int64')
Bank_Merge.info()
Bank_Merge.head()
# Data type of the LoanOnCard is changed to Int64 now as it is holding only 0 and 1
Bank_Merge.describe(include='all').T
# Check the skewness for Bank_Merge Dataframe
# HighestSpend, MonthlyAverageSpend , Mortgage are right skewed. (Outliers present towards right) -
# Also from above summary, we can see that Min value in the CustomerSince is -3 which might not be possible and needs to be treated
# same can be viewed in the Histograms below
Bank_Merge.hist(figsize=(15,30));
CreditCard
FixedDepositAccount
HiddenScore
InternetBanking
Level
LoanOnCard
Security
ZipCode
Age - Normally Distributed
CustomerSince - Normally Distributed
HighestSpend - Right skewed
MonthlyAverageSpend - Right skewed
Mortgage - Right skewed
# Check for the negative values in the CustomerSince column
print('Total number of rows where CustomerSince value is less than 0 are : {}'.format(Bank_Merge[Bank_Merge.CustomerSince < 0].CustomerSince.count()))
print('Unique values in CustomerSince when less than 0 are : {}'.format(Bank_Merge[Bank_Merge.CustomerSince < 0].CustomerSince.unique()))
# Impute all negative tenures (-1, -2, -3) with the column mean computed ONCE
# (~20.1046). FIX: the original replaced each value in a separate step,
# recomputing the mean on the partially-imputed series, so -2 and -3 were
# imputed with slightly different (already-shifted) means.
_cs_mean = Bank_Merge.CustomerSince.mean()
Bank_Merge.CustomerSince = Bank_Merge.CustomerSince.mask(Bank_Merge.CustomerSince < 0, _cs_mean)
# Re-check: no negative values should remain in the CustomerSince column
print('Total number of rows where CustomerSince value is less than 0 are : {}'.format(Bank_Merge[Bank_Merge.CustomerSince < 0].CustomerSince.count()))
print('Unique values in CustomerSince when less than 0 are : {}'.format(Bank_Merge[Bank_Merge.CustomerSince < 0].CustomerSince.unique()))
Bank_Merge.describe(include='all').T
# Lets find the correlation between attributes using the heatmap
fig = plt.subplots(figsize=(15,12))
sns.heatmap(Bank_Merge.corr(),annot=True);
1) Age and CustomerSince - .98
2) HighestSpend and MonthlyAverageSpend - .65
3) LoanOnCard and HighestSpend - .5
4) LoanOnCard and MonthlyAverageSpend - .37
5) LoanOnCard and FixedDepositAccount - .32
# Pairwise scatter matrix over all numeric attributes.
sns.pairplot(Bank_Merge);
##### Univariate of Age, split by loan status (1 = taken, 0 = not taken)
fig = plt.figure(figsize = (15,3))
plt.subplot(1, 2, 1)
sns.distplot(Bank_Merge[Bank_Merge.LoanOnCard == 1].Age,color = 'r')
plt.title("Distribution of Age of Customers who have taken the Loan")
plt.subplot(1, 2, 2)
sns.distplot(Bank_Merge[Bank_Merge.LoanOnCard == 0].Age,color = 'g')
plt.title("Distribution of Age of Customers who have not taken the Loan")
plt.show()
# Univariate of CustomerSince, split by loan status
fig = plt.figure(figsize = (15,3))
plt.subplot(1, 2, 1)
sns.distplot(Bank_Merge[Bank_Merge.LoanOnCard == 1].CustomerSince,color = 'r')
plt.title("Distribution of CustomerSince for Customers who have taken the Loan")
plt.subplot(1, 2, 2)
sns.distplot(Bank_Merge[Bank_Merge.LoanOnCard == 0].CustomerSince,color = 'g')
plt.title("Distribution of CustomerSince for Customers who have not taken the Loan")
plt.show()
# Univariate of HighestSpend, split by loan status
fig = plt.figure(figsize = (15,3))
plt.subplot(1, 2, 1)
sns.distplot(Bank_Merge[Bank_Merge.LoanOnCard == 1].HighestSpend,color = 'r')
plt.title("Distribution of HighestSpend for customers who have taken the Loan")
plt.subplot(1, 2, 2)
sns.distplot(Bank_Merge[Bank_Merge.LoanOnCard == 0].HighestSpend,color = 'g')
plt.title("Distribution of HighestSpend for customers who have not taken the Loan")
plt.show()
# Univariate of MonthlyAverageSpend, split by loan status
fig = plt.figure(figsize = (16,3))
plt.subplot(1, 2, 1)
sns.distplot(Bank_Merge[Bank_Merge.LoanOnCard == 1].MonthlyAverageSpend,color = 'r')
plt.title("Distribution of MonthlyAverageSpend for customers who have taken the Loan")
plt.subplot(1, 2, 2)
sns.distplot(Bank_Merge[Bank_Merge.LoanOnCard == 0].MonthlyAverageSpend,color = 'g')
plt.title("Distribution of MonthlyAverageSpend for customers who have not taken the Loan")
plt.show()
# Univariate of Mortgage, split by loan status
fig = plt.figure(figsize = (15,3))
plt.subplot(1, 2, 1)
sns.distplot(Bank_Merge[Bank_Merge.LoanOnCard == 1].Mortgage,color = 'r')
plt.title("Distribution of Mortgage for customers who have taken the Loan")
plt.subplot(1, 2, 2)
sns.distplot(Bank_Merge[Bank_Merge.LoanOnCard == 0].Mortgage,color = 'g')
plt.title("Distribution of Mortgage for customers who have not taken the Loan")
plt.show()
# Box Plots for HighestSpend, MonthlyAverageSpend, Mortgage where Customer has NOT taken the Loan
fig = plt.figure(figsize = (15,5))
plt.subplot(1, 3, 1)
sns.boxplot(y=Bank_Merge[Bank_Merge.LoanOnCard == 0].HighestSpend,color = 'r');
plt.title("HighestSpend and Loan Not Taken")
plt.subplot(1, 3, 2)
sns.boxplot(y=Bank_Merge[Bank_Merge.LoanOnCard == 0].MonthlyAverageSpend,color = 'g');
plt.title("MonthlyAverageSpend and Loan Not Taken")
plt.subplot(1, 3, 3)
sns.boxplot(y=Bank_Merge[Bank_Merge.LoanOnCard == 0].Mortgage,color = 'b');
plt.title("Mortgage and Loan Not Taken")
plt.show()
# Box Plots for HighestSpend, MonthlyAverageSpend, Mortgage where Customer has taken the Loan
fig = plt.figure(figsize = (15,5))
plt.subplot(1, 3, 1)
sns.boxplot(y=Bank_Merge[Bank_Merge.LoanOnCard == 1].HighestSpend,color = 'r');
plt.title("HighestSpend and Loan Taken")
plt.subplot(1, 3, 2)
sns.boxplot(y=Bank_Merge[Bank_Merge.LoanOnCard == 1].MonthlyAverageSpend,color = 'g');
plt.title("MonthlyAverageSpend and Loan Taken")
plt.subplot(1, 3, 3)
sns.boxplot(y=Bank_Merge[Bank_Merge.LoanOnCard == 1].Mortgage,color = 'b');
plt.title("Mortgage and Loan Taken")
plt.show()
# Potential cross-sell targets: customers without a loan on card but with
# high spend and a sizeable mortgage. One combined boolean mask replaces
# the repeated re-filtering — the selected rows are identical.
target_mask = (
    (Bank_Merge.LoanOnCard == 0)
    & (Bank_Merge.HighestSpend > 150)
    & (Bank_Merge.MonthlyAverageSpend > 5)
    & (Bank_Merge.Mortgage > 200)
)
Target_Customers = Bank_Merge[target_mask]
Target_Customers.describe(include ='all').T
Target_Customers
##### Above 8 Customers could be the Target Customers
# Count plots of the six categorical features, split by loan status.
# FIX: pass the data as the keyword argument `x=` — positional use of the
# first argument to sns.countplot is deprecated since seaborn 0.12 and
# raises an error in later releases. The rendered figure is unchanged.
fig = plt.figure(figsize = (15,8))
categorical_cols = ['CreditCard', 'FixedDepositAccount', 'HiddenScore',
                    'InternetBanking', 'Level', 'Security']
for pos, column in enumerate(categorical_cols, start=1):
    plt.subplot(2, 3, pos)
    sns.countplot(x=Bank_Merge[column], hue=Bank_Merge.LoanOnCard)
plt.show()
1st Graph - Countplot for CreditCard : Customers who have taken the Loan hold fewer credit cards
2nd Graph - Countplot for FixedDepositAccount : Customers who have taken the Loan have fewer Fixed Deposit accounts
3rd Graph - Countplot for HiddenScore : The distribution of Loan on Card is the same across all the hidden scores
4th Graph - Countplot for InternetBanking : Internet banking is activated more often among customers who have taken the Loan than among those who have not
5th Graph - Countplot for Level : Customers who have taken the Loan are mostly associated with Level 3 and Level 2
6th Graph - Countplot for Security : Customers who have taken the Loan hold fewer security assets
# Create an outlier-free copy of the data using the IQR (Tukey fence) rule
# for HighestSpend, applied only to rows of the majority class (LoanOnCard == 0).
HighestSpend_Q1 = Bank_Merge.HighestSpend.quantile(0.25)
HighestSpend_Q3 = Bank_Merge.HighestSpend.quantile(0.75)
HighestSpend_IQR = HighestSpend_Q3 - HighestSpend_Q1
# Lower fence and upper fence: 1.5 * IQR beyond the quartiles.
Upper_Fence1 = HighestSpend_Q3 + (1.5 * HighestSpend_IQR)
Lower_Fence1 = HighestSpend_Q1 - (1.5 * HighestSpend_IQR)
# BUG FIX: the original used `Bank_Merge_Out = Bank_Merge`, which merely
# aliases the same DataFrame — every inplace drop below also mutated
# Bank_Merge, so the later "with outliers" model silently trained on the
# trimmed data. A real copy keeps the two datasets independent.
Bank_Merge_Out = Bank_Merge.copy()
Bank_Merge_Out.drop(Bank_Merge_Out[(Bank_Merge_Out.HighestSpend > Upper_Fence1) & (Bank_Merge_Out.LoanOnCard == 0)].index, inplace=True)
Bank_Merge_Out.drop(Bank_Merge_Out[(Bank_Merge_Out.HighestSpend < Lower_Fence1) & (Bank_Merge_Out.LoanOnCard == 0)].index, inplace=True)
Bank_Merge_Out.ID.count()
# Remove MonthlyAverageSpend outliers (IQR rule, LoanOnCard == 0 rows only).
MonthlyAverageSpend_Q1 = Bank_Merge_Out.MonthlyAverageSpend.quantile(0.25)
MonthlyAverageSpend_Q3 = Bank_Merge_Out.MonthlyAverageSpend.quantile(0.75)
# BUG FIX: the original computed this IQR from the HighestSpend quartiles
# (copy-paste error), so the fences were wrong for this feature.
MonthlyAverageSpend_IQR = MonthlyAverageSpend_Q3 - MonthlyAverageSpend_Q1
# Lower fence and upper fence: 1.5 * IQR beyond the quartiles.
Upper_Fence2 = MonthlyAverageSpend_Q3 + (1.5 * MonthlyAverageSpend_IQR)
Lower_Fence2 = MonthlyAverageSpend_Q1 - (1.5 * MonthlyAverageSpend_IQR)
# BUG FIX: the original filtered against Upper_Fence1/Lower_Fence1 (the
# HighestSpend fences) instead of the fences computed just above.
Bank_Merge_Out.drop(Bank_Merge_Out[(Bank_Merge_Out.MonthlyAverageSpend > Upper_Fence2) & (Bank_Merge_Out.LoanOnCard == 0)].index, inplace=True)
Bank_Merge_Out.drop(Bank_Merge_Out[(Bank_Merge_Out.MonthlyAverageSpend < Lower_Fence2) & (Bank_Merge_Out.LoanOnCard == 0)].index, inplace=True)
Bank_Merge_Out.ID.count()
# Remove Mortgage outliers (IQR rule, LoanOnCard == 0 rows only).
Mortgage_Q1 = Bank_Merge_Out.Mortgage.quantile(0.25)
Mortgage_Q3 = Bank_Merge_Out.Mortgage.quantile(0.75)
# BUG FIX: the original computed this IQR from the HighestSpend quartiles
# (copy-paste error), so the fences were wrong for this feature.
Mortgage_IQR = Mortgage_Q3 - Mortgage_Q1
# Lower fence and upper fence: 1.5 * IQR beyond the quartiles.
Upper_Fence3 = Mortgage_Q3 + (1.5 * Mortgage_IQR)
Lower_Fence3 = Mortgage_Q1 - (1.5 * Mortgage_IQR)
# BUG FIX: the original filtered against Upper_Fence1/Lower_Fence1 (the
# HighestSpend fences) instead of the Mortgage fences computed just above.
Bank_Merge_Out.drop(Bank_Merge_Out[(Bank_Merge_Out.Mortgage > Upper_Fence3) & (Bank_Merge_Out.LoanOnCard == 0)].index, inplace=True)
Bank_Merge_Out.drop(Bank_Merge_Out[(Bank_Merge_Out.Mortgage < Lower_Fence3) & (Bank_Merge_Out.LoanOnCard == 0)].index, inplace=True)
Bank_Merge_Out.ID.count()
Bank_Merge_Out.info()
# Predictors: drop the identifier/low-signal columns (ID, Age, ZipCode,
# HiddenScore) and the target itself (LoanOnCard).
X = Bank_Merge.drop(['ID','Age','ZipCode','HiddenScore','LoanOnCard'],axis=1)
# Target
y = Bank_Merge['LoanOnCard']
X.describe(include='all').T # We can see that ID, Age, ZipCode, HiddenScore and LoanOnCard are dropped from the predictor dataframe
print('LoanOnCard = 1 in original dataframe are {} %'.format(len(Bank_Merge[Bank_Merge.LoanOnCard == 1])/len(Bank_Merge)*100))
print('LoanOnCard = 0 in original dataframe are {} %'.format(len(Bank_Merge[Bank_Merge.LoanOnCard == 0])/len(Bank_Merge)*100))
# Split X and y into training and test set in 70:30 ratio.
# stratify=y keeps the class proportions identical in both splits.
# FIX: random_state added so the split (and every downstream accuracy
# number) is reproducible across notebook reruns.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, stratify=y, random_state=42)
print('LoanOnCard = 1 in Train dataframe are {} %'.format(len(y_train[y_train == 1])/len(y_train)*100))
print('LoanOnCard = 0 in Train dataframe are {} %'.format(len(y_train[y_train == 0])/len(y_train)*100))
print('LoanOnCard = 1 in Test dataframe are {} %'.format(len(y_test[y_test == 1])/len(y_test)*100))
print('LoanOnCard = 0 in Test dataframe are {} %'.format(len(y_test[y_test == 0])/len(y_test)*100))
# Baseline logistic regression on the data that still contains outliers.
Log_reg = LogisticRegression(solver="liblinear")
Log_reg.fit(X_train, y_train)
predicted_labels = Log_reg.predict(X_test)
print('Accuracy for train dataset: {} '.format(Log_reg.score(X_train, y_train)))
print('Accuracy for test dataset: {} '.format(Log_reg.score(X_test, y_test)))
# Accuracy measures and confusion matrix.
from sklearn import metrics
print(metrics.confusion_matrix(y_test, predicted_labels))
print("Confusion Matrix")
# Heatmap of the confusion matrix: rows = actual class, columns = predicted.
cm = metrics.confusion_matrix(y_test, predicted_labels, labels=[0, 1])
df_cm = pd.DataFrame(cm, index=[0, 1], columns=["Predict 0", "Predict 1"])
plt.figure(figsize = (4,3))
sns.heatmap(df_cm, annot=True)
# Per-class precision/recall/F1.
from sklearn.metrics import classification_report
print(classification_report(y_test,predicted_labels))
print(metrics.confusion_matrix(y_test, predicted_labels))
# ROC curve for the baseline logistic regression.
# BUG FIX: roc_auc_score was fed hard 0/1 predictions (Log_reg.predict),
# which collapses the ROC to a single operating point and understates the
# AUC. Use the positive-class probabilities, consistent with roc_curve.
roc_lr = roc_auc_score(y_test, Log_reg.predict_proba(X_test)[:, 1])
fpr, tpr, thresholds = roc_curve(y_test, Log_reg.predict_proba(X_test)[:,1])
plt.figure(figsize = (10 , 5))
plt.plot(fpr, tpr, label = 'Logistic Regression (area =' + str(roc_lr) + ')')
plt.plot([0, 1], [0, 1])  # chance diagonal for reference
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc = 'lower right')
plt.show()
# Predictors from the outlier-free frame: drop the identifier/low-signal
# columns (ID, Age, ZipCode, HiddenScore) and the target (LoanOnCard).
X_out = Bank_Merge_Out.drop(['ID','Age','ZipCode','HiddenScore','LoanOnCard'],axis=1)
# Target
y_out = Bank_Merge_Out['LoanOnCard']
y_out.count()
X_out.describe(include='all').T # We can see that ID, Age, ZipCode, HiddenScore and LoanOnCard are dropped from the predictor dataframe
# FIX: the messages said "original dataframe" although this is the
# outlier-free frame — corrected so the printed output is not misleading.
print('LoanOnCard = 1 in outlier-free dataframe are {} %'.format(len(Bank_Merge_Out[Bank_Merge_Out.LoanOnCard == 1])/len(Bank_Merge_Out)*100))
print('LoanOnCard = 0 in outlier-free dataframe are {} %'.format(len(Bank_Merge_Out[Bank_Merge_Out.LoanOnCard == 0])/len(Bank_Merge_Out)*100))
# Split X and y into training and test set in 70:30 ratio.
# stratify keeps the class proportions identical in both splits.
# FIX: random_state added so the split is reproducible across reruns.
X_train_out, X_test_out, y_train_out, y_test_out = train_test_split(X_out, y_out, test_size=0.30, stratify=y_out, random_state=42)
print('LoanOnCard = 1 in Train dataframe are {} %'.format(len(y_train_out[y_train_out == 1])/len(y_train_out)*100))
print('LoanOnCard = 0 in Train dataframe are {} %'.format(len(y_train_out[y_train_out == 0])/len(y_train_out)*100))
print('LoanOnCard = 1 in Test dataframe are {} %'.format(len(y_test_out[y_test_out == 1])/len(y_test_out)*100))
print('LoanOnCard = 0 in Test dataframe are {} %'.format(len(y_test_out[y_test_out == 0])/len(y_test_out)*100))
# Logistic regression retrained on the outlier-free data, mirroring the
# baseline model so the two can be compared directly.
Log_reg_out = LogisticRegression(solver="liblinear")
Log_reg_out.fit(X_train_out, y_train_out)
predicted_labels_out = Log_reg_out.predict(X_test_out)
print('Accuracy for train dataset: {} '.format(Log_reg_out.score(X_train_out, y_train_out)))
print('Accuracy for test dataset: {} '.format(Log_reg_out.score(X_test_out, y_test_out)))
# Accuracy measures and confusion matrix.
from sklearn import metrics
print(metrics.confusion_matrix(y_test_out, predicted_labels_out))
print("Confusion Matrix")
# Heatmap of the confusion matrix: rows = actual class, columns = predicted.
cm_out = metrics.confusion_matrix(y_test_out, predicted_labels_out, labels=[0, 1])
df_cm_out = pd.DataFrame(cm_out, index=[0, 1], columns=["Predict 0", "Predict 1"])
plt.figure(figsize = (4,3))
sns.heatmap(df_cm_out, annot=True)
# Per-class precision/recall/F1.
from sklearn.metrics import classification_report
print(classification_report(y_test_out,predicted_labels_out))
print(metrics.confusion_matrix(y_test_out, predicted_labels_out))
# ROC curve for the logistic regression trained on outlier-free data.
# BUG FIX: roc_auc_score was fed hard 0/1 predictions, which collapses the
# ROC to one operating point and understates AUC — use probabilities.
roc_lr = roc_auc_score(y_test_out, Log_reg_out.predict_proba(X_test_out)[:, 1])
fpr, tpr, thresholds = roc_curve(y_test_out, Log_reg_out.predict_proba(X_test_out)[:,1])
plt.figure(figsize = (10 , 5))
plt.plot(fpr, tpr, label = 'Logistic Regression (area =' + str(roc_lr) + ')')
plt.plot([0, 1], [0, 1])  # chance diagonal for reference
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc = 'lower right')
plt.show()
from sklearn.model_selection import GridSearchCV
# Hyperparameter search space for logistic regression.
# NOTE: of these solvers only liblinear supports the 'l1' penalty; invalid
# solver/penalty combinations are simply scored 0 via error_score=0.
hyperparameters = {
    'solver': ['newton-cg', 'lbfgs', 'liblinear'],
    'penalty': ['l1', 'l2'],
    'C': [0.01, 0.1, 1.0, 10, 100],
}
# Fresh estimator for tuning.
Log_tune = LogisticRegression()
# Repeated stratified k-fold keeps class balance in every fold and repeats
# the split three times for a more stable score estimate.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42)
Grid = GridSearchCV(estimator=Log_tune, param_grid=hyperparameters, n_jobs=-1, cv=cv, scoring='accuracy', error_score=0)
best_model1 = Grid.fit(X_train_out, y_train_out)
# Report the winning combination and its cross-validated accuracy.
print('Best parameters:', best_model1.best_params_)
print('Best Score:', best_model1.best_score_)
predict_out = best_model1.predict(X_test_out)
print('Testing Accuracy with Tuning: {}'.format(best_model1.score(X_test_out, y_test_out)))
print('Training Accuracy with Tuning: {}'.format(best_model1.score(X_train_out, y_train_out)))
# Confusion matrix and classification report for the tuned model.
print("Confusion Matrix")
cm_out1 = metrics.confusion_matrix(y_test_out, predict_out, labels=[0, 1])
df_cm_out1 = pd.DataFrame(cm_out1, index=[0, 1], columns=["Predict 0", "Predict 1"])
plt.figure(figsize = (4,3))
sns.heatmap(df_cm_out1, annot=True)
# Per-class precision/recall/F1.
from sklearn.metrics import classification_report
print(classification_report(y_test_out,predict_out))
print(metrics.confusion_matrix(y_test_out, predict_out))
# ROC curve for the tuned logistic regression.
# BUG FIX: roc_auc_score was fed hard 0/1 predictions, which collapses the
# ROC to one operating point and understates AUC — use probabilities.
roc_lr = roc_auc_score(y_test_out, best_model1.predict_proba(X_test_out)[:, 1])
fpr, tpr, thresholds = roc_curve(y_test_out, best_model1.predict_proba(X_test_out)[:,1])
plt.figure(figsize = (10 , 5))
plt.plot(fpr, tpr, label = 'Logistic Regression (area =' + str(roc_lr) + ')')
plt.plot([0, 1], [0, 1])  # chance diagonal for reference
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc = 'lower right')
plt.show()
# Gaussian Naive Bayes on the outlier-free data, evaluated the same way as
# the logistic regression models for an apples-to-apples comparison.
NB_Model = GaussianNB()
NB_Model.fit(X_train_out, y_train_out)
predict_NB_Model = NB_Model.predict(X_test_out)
print('Testing Accuracy with Naive Bayes: {}'.format(NB_Model.score(X_test_out, y_test_out)))
print('Training Accuracy with Naive Bayes: {}'.format(NB_Model.score(X_train_out, y_train_out)))
print("Confusion Matrix")
cm_out_NB = metrics.confusion_matrix(y_test_out, predict_NB_Model, labels=[0, 1])
df_cm_out_NB = pd.DataFrame(cm_out_NB, index=[0, 1], columns=["Predict 0", "Predict 1"])
plt.figure(figsize = (4,3))
sns.heatmap(df_cm_out_NB, annot=True)
# Per-class precision/recall/F1.
from sklearn.metrics import classification_report
print(classification_report(y_test_out,predict_NB_Model))
print(metrics.confusion_matrix(y_test_out, predict_NB_Model))
# ROC curve for the Naive Bayes model.
# BUG FIX: roc_auc_score was fed hard 0/1 predictions, which collapses the
# ROC to one operating point and understates AUC — use probabilities.
roc_lr = roc_auc_score(y_test_out, NB_Model.predict_proba(X_test_out)[:, 1])
fpr, tpr, thresholds = roc_curve(y_test_out, NB_Model.predict_proba(X_test_out)[:,1])
plt.figure(figsize = (10 , 5))
plt.plot(fpr, tpr, label = 'Naive bayes (area =' + str(roc_lr) + ')')
plt.plot([0, 1], [0, 1])  # chance diagonal for reference
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc = 'lower right')
plt.show()